# -*- coding: utf-8 -*-

import requests
from bs4 import BeautifulSoup
from goose3 import Goose 
from goose3.text import StopWordsChinese
import jieba


# requests library: GET request
resp = requests.get("http://www.baidu.com")
print(resp.status_code)   # HTTP status code, e.g. 200
print(resp.content)       # raw response body as bytes
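
# The Baidu homepage declares UTF-8, but requests may guess a different
# encoding from the response headers, so resp.text can come out garbled.
# A small sketch: adopt the encoding requests detects from the body before
# decoding (apparent_encoding is a standard requests attribute).
resp.encoding = resp.apparent_encoding
print(resp.text[:200])   # first 200 characters of the decoded HTML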

# requests library: POST request
url = "https://fanyi.baidu.com/sug"
data = {
    "kw": "test"
}
resp = requests.post(url=url, data=data)
print(resp.json())   # parse the JSON response body
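
# In a real script it helps to fail fast on network problems. A sketch of the
# same POST with a timeout and an HTTP status check; the 10-second timeout is
# an arbitrary choice, not something the fanyi.baidu.com endpoint requires.
resp = requests.post(url=url, data=data, timeout=10)
resp.raise_for_status()           # raise an exception for 4xx/5xx responses
print(resp.json().get("data"))    # suggestions usually sit under "data"; adjust if the response shape differs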

# Example: fetch the Sina homepage with a browser-like User-Agent

headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"}
resp = requests.get(url="https://www.sina.com.cn/", headers=headers)
soup = BeautifulSoup(resp.content, "html.parser")
# Grab the top headline links
tar_list = soup.select("ul.list-a.news_top li a")
for item in tar_list:
    print(item.text.strip())   # headline text
    print(item.get('href'))    # headline URL
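
# The selector above depends on Sina's current markup; if the layout changes,
# tar_list can be empty and tar_list[0] below would raise an IndexError.
# A small defensive sketch:
if not tar_list:
    raise SystemExit("no headlines matched 'ul.list-a.news_top li a'; the page structure may have changed")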
tar_url = tar_list[0].get('href')   # take the first headline as the target article
print(tar_url)
# Extract the article body with goose3 (StopWordsChinese improves extraction for Chinese pages)
g = Goose({
    'browser_user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36',
    'stopwords_class': StopWordsChinese,
})
article = g.extract(url=tar_url)
print(article.title)          # extracted article title
print(article.cleaned_text)   # extracted body text with markup stripped
tar_content = article.cleaned_text
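
# goose3 downloads the page itself; if a site blocks its default client, one
# workaround (a sketch, assuming extract() accepts raw_html, which recent
# goose3 releases do) is to fetch the page with requests and the same
# browser-like headers, then hand the HTML string to goose:
alt_resp = requests.get(tar_url, headers=headers)
alt_article = g.extract(raw_html=alt_resp.text)
print(alt_article.title)   # should match the title extracted above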
# Segment the article text into words with jieba
cut_list = jieba.lcut(tar_content)   # lcut() returns a plain Python list of tokens
print(cut_list)
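
# A common next step after segmentation is a simple word-frequency count.
# This is a sketch, not part of the original flow: dropping single-character
# tokens and whitespace is a rough stand-in for a proper stopword list.
from collections import Counter
words = [w for w in cut_list if len(w) > 1 and w.strip()]
print(Counter(words).most_common(10))   # ten most frequent words in the article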